{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e25d5aad-a572-4b7a-bd57-1bff16c24e18",
   "metadata": {},
   "source": [
    "## Handling Imbalanced Dataset\n",
    "\n",
    "1. Up Sampling\n",
    "2. Down Sampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "109e96e0-08bf-4813-909c-e6fd5fdff853",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Set the random seed for reproducibility\n",
    "np.random.seed(123)\n",
    "\n",
    "# Create a dataframe with two classes\n",
    "n_samples = 1000\n",
    "class_0_ratio = 0.9\n",
    "n_class_0 = int(n_samples * class_0_ratio)\n",
    "n_class_1 = n_samples - n_class_0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "abde452c-b1f1-4bd7-b9d6-0e73883d81b2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(900, 100)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n_class_0,n_class_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "04d03122-c2e2-4ef8-9dda-9ed8c8d99eed",
   "metadata": {},
   "outputs": [],
   "source": [
    "## CREATE MY DATAFRAME WITH IMBALANCED DATASET\n",
    "class_0 = pd.DataFrame({\n",
    "    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),\n",
    "    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),\n",
    "    'target': [0] * n_class_0\n",
    "})\n",
    "\n",
    "class_1 = pd.DataFrame({\n",
    "    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_1),\n",
    "    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_1),\n",
    "    'target': [1] * n_class_1\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f8e0833a-ae62-4620-81be-d6f8af6814ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "df=pd.concat([class_0,class_1]).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "58f414a8-985c-467b-96e3-df40974714f7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feature_1</th>\n",
       "      <th>feature_2</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>1.376371</td>\n",
       "      <td>2.845701</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>2.239810</td>\n",
       "      <td>0.880077</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>1.131760</td>\n",
       "      <td>1.640703</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>2.902006</td>\n",
       "      <td>0.390305</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>2.697490</td>\n",
       "      <td>2.013570</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     feature_1  feature_2  target\n",
       "995   1.376371   2.845701       1\n",
       "996   2.239810   0.880077       1\n",
       "997   1.131760   1.640703       1\n",
       "998   2.902006   0.390305       1\n",
       "999   2.697490   2.013570       1"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b989d68f-f38b-4edf-ab4b-7be643574f55",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    900\n",
       "1    100\n",
       "Name: target, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['target'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "14516002-ea46-43fe-9842-13bbfe816d29",
   "metadata": {},
   "outputs": [],
   "source": [
    "## upsampling\n",
    "df_minority=df[df['target']==1]\n",
    "df_majority=df[df['target']==0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fe628e67-5c13-4040-becb-8a782b2470d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils import resample\n",
    "df_minority_upsampled=resample(df_minority,replace=True, #Sample With replacement\n",
    "         n_samples=len(df_majority),\n",
    "         random_state=42\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6d742a1a-059c-43d6-accc-638acf200be0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(900, 3)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_minority_upsampled.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "56d2a178-0dd0-4702-9007-76555c02737d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feature_1</th>\n",
       "      <th>feature_2</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>951</th>\n",
       "      <td>1.125854</td>\n",
       "      <td>1.843917</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>992</th>\n",
       "      <td>2.196570</td>\n",
       "      <td>1.397425</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>914</th>\n",
       "      <td>1.932170</td>\n",
       "      <td>2.998053</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>971</th>\n",
       "      <td>2.272825</td>\n",
       "      <td>3.034197</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>960</th>\n",
       "      <td>2.870056</td>\n",
       "      <td>1.550485</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     feature_1  feature_2  target\n",
       "951   1.125854   1.843917       1\n",
       "992   2.196570   1.397425       1\n",
       "914   1.932170   2.998053       1\n",
       "971   2.272825   3.034197       1\n",
       "960   2.870056   1.550485       1"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_minority_upsampled.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "7f2a6ef0-8023-43d0-af56-0bbe4e96b974",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_upsampled=pd.concat([df_majority,df_minority_upsampled])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f47db4ac-1ef4-476a-8dfb-3b2374c0bdd8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    900\n",
       "1    900\n",
       "Name: target, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_upsampled['target'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5fd4ee19-5a3d-4bc9-97dc-3939f85c37c1",
   "metadata": {},
   "source": [
    "## Down Sampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f986d088-eb6c-48b7-820d-56c7928f900d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    900\n",
      "1    100\n",
      "Name: target, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Set the random seed for reproducibility\n",
    "np.random.seed(123)\n",
    "\n",
    "# Create a dataframe with two classes\n",
    "n_samples = 1000\n",
    "class_0_ratio = 0.9\n",
    "n_class_0 = int(n_samples * class_0_ratio)\n",
    "n_class_1 = n_samples - n_class_0\n",
    "\n",
    "class_0 = pd.DataFrame({\n",
    "    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),\n",
    "    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),\n",
    "    'target': [0] * n_class_0\n",
    "})\n",
    "\n",
    "class_1 = pd.DataFrame({\n",
    "    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_1),\n",
    "    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_1),\n",
    "    'target': [1] * n_class_1\n",
    "})\n",
    "\n",
    "df = pd.concat([class_0, class_1]).reset_index(drop=True)\n",
    "\n",
    "# Check the class distribution\n",
    "print(df['target'].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "93cd5e60-5138-4029-9b07-c88d024ce7c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "## downsampling\n",
    "df_minority=df[df['target']==1]\n",
    "df_majority=df[df['target']==0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29f660cd-ea9e-488d-820a-80d4189718f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils import resample\n",
    "df_majority_upsampled=resample(df_minority,replace=True, #Sample With replacement\n",
    "         n_samples=len(df_majority),\n",
    "         random_state=42\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5db812b7-e9b6-4ac5-936e-1dce2697ae8a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "329ddfb6-2c1a-4b89-a973-ada272c17a1e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
